import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
df = pd.read_csv('netflix-report/CONTENT_INTERACTION/ViewingActivity.csv')
df = df.drop(columns=['Attributes', 'Supplemental Video Type', 'Bookmark', 'Latest Bookmark', 'Country'])
df['Start Time'] = pd.to_datetime(df['Start Time'], utc=True)
df['Duration'] = pd.to_timedelta(df['Duration']).dt.seconds/3600 #Convert duration to timedelta format and then to minutes
df['Profile Name'].value_counts()
G man 5832 Rons Rudums 5041 DMD 4224 Galerts 3049 MD_LD 2962 Name: Profile Name, dtype: int64
round(df['Duration'].sum(), 2)
6019.61
fig1 = px.pie(df, names='Profile Name', values='Duration', title='Percentage of total watchtime for each profile.')
fig1.show(renderer='notebook')
df["Device Type"].unique()
array(['Samsung CE 2021 Nike-L UHD TV Smart TV', 'Apple iPhone 13',
'Chrome PC (Cadmium)', 'LG 2017 RTK K3Lp Standard UHD TV Smart TV',
'DefaultWidevineAndroidPhone',
'Netflix Chrome MAC (Cadmium) HTML 5', 'LG 2014 LG SoC Smart TV',
'Android DefaultWidevineL3Phone Android Phone',
'LG 2015 LG SoC DTV Smart TV',
'Android DefaultWidevineL3Tablet Android Tablet',
'LG 2019 RTK K5Lp Standard UHD TV Smart TV',
'LG 2018 Mstar M3 Standard DTV Smart TV', 'Firefox PC (Cadmium)'],
dtype=object)
df = df.replace(to_replace=["Samsung CE 2021 Nike-L UHD TV Smart TV", "LG 2017 RTK K3Lp Standard UHD TV Smart TV", "LG 2014 LG SoC Smart TV", "LG 2015 LG SoC DTV Smart TV", "LG 2019 RTK K5Lp Standard UHD TV Smart TV", "LG 2018 Mstar M3 Standard DTV Smart TV"], value="TV")
df = df.replace(to_replace=['Apple iPhone 13', 'DefaultWidevineAndroidPhone', 'Android DefaultWidevineL3Phone Android Phone'], value='Phone')
df = df.replace(to_replace=['Chrome PC (Cadmium)', 'Netflix Chrome MAC (Cadmium) HTML 5', 'Firefox PC (Cadmium)'], value='Web Browser')
df = df.replace(to_replace='Android DefaultWidevineL3Tablet Android Tablet', value='Tablet')
fig2 = px.histogram(df, x='Profile Name', y='Duration', color='Device Type', text_auto=True, title='Watch time on different devices in hours.')
fig2.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig2.show(renderer='notebook')
df2 = pd.DataFrame()
df2['Date'] = pd.to_datetime(df['Start Time']).dt.date
df2['Duration'] = df['Duration']
watch_time = df2.groupby(['Date'])['Duration'].sum().reset_index()
df2 = pd.DataFrame(watch_time)
fig3 = px.line(df2, x='Date', y='Duration', title='Total time spent watching per day for all profiles in hours.')
fig3.show(renderer='notebook')
df2['Date'] = pd.to_datetime(df2['Date'])
df2['Weekday'] = df2['Date'].dt.day_name()
fig4 = px.histogram(df2, x='Weekday', y='Duration', title='Time spent watching per weekday in hours.')
fig4.show(renderer='notebook')
df['Title'] = df['Title'].astype(str).str.split(':').str[0] #Modify original df to remove everything after ':' in titles.
show_watch_time = df.groupby(['Title'])['Duration'].sum().reset_index()
show_watch_time = show_watch_time.sort_values(by='Duration', ascending=False).head(25)
df3 = pd.DataFrame(show_watch_time)
df3 = df3.sample(frac=1)#Randomize row order to make graph more visually pleasing
fig5 = px.scatter(df3, x='Title', y='Duration', size='Duration', color='Duration', title='Top 25 most watched shows on all profiles.')
fig5.show(renderer='notebook')